Main Figures
Figure 1
Figure 1a
Diagram of plasmid titration experiment
Figure 1b
Chromatograms from incremental titration C>T
Figure 1c
Scatter plot from titration experiment
titrations = unique(figure_1.df$titration)
figure_1c = plotTitration(figure_1.df , titration_index = 2, "T", "C")
figure_1c
Figure 1d
MultiEditR algorithm
Figure 1e
MultiEditR vs. REDItools scatterplot
figure_1e_perc_cutoff = 0.01
figure_1e = comparison.df %>%
filter(ngs_perc >= figure_1e_perc_cutoff & ngs_perc <= 1-figure_1e_perc_cutoff) %>%
filter(sanger_perc >= figure_1e_perc_cutoff & sanger_perc <= 1-figure_1e_perc_cutoff) %>%
plotResults(., var1 = "sanger_perc", var2 = "ngs_perc") %>%
.[[1]] +
xlab("RNA-seq (REDitools)") +
ylab("Sanger sequencing (MultiEditR)")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
figure_1e
Figure 1f
figure_1f_cor_data = comparison.df %$%
cor.test(x = abs(diff), y = Reads, method = "spearman") %>%
unlist() %>%
.[c(2, 3)] %>%
as.numeric() %>%
signif(3)
## Warning in cor.test.default(x = abs(diff), y = Reads, method = "spearman"):
## Cannot compute exact p-value with ties
figure_1f_correlation_label = paste0("ρ = ", figure_1f_cor_data[2], ", P = ", figure_1f_cor_data[1])
figure_1f = comparison.df %>%
ggplot(aes(x = Reads, y = abs(diff))) +
scale_fill_manual(values = c("A" = "#4daf4a", "C" = "#377eb8", "G" = "#999999", "T" = "#e41a1c")) +
geom_point(alpha = 0.3) +
scale_x_log10(limits = c(1, 1e4), breaks = c(1, 10, 100, 1000, 10000)) +
scale_y_log10(limits = c(1e-5, 1e0), breaks = c(1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0), labels = rev(c("100.0%", "10.0%", "1.0%", "0.1%", "0.01%", "0.001%"))) +
# scale_y_continuous(breaks = seq(0, 1, 0.1)) +
# scale_y_log10() +
# geom_density_2d(aes(color = "black")) +
geom_smooth(fill = "black", method = "lm", size = 1.5) +
# scale_y_continuous(labels = scales::percent(seq(-0.5,0.5, 0.1)), limits = c(-0.5,0.5), breaks = seq(-0.5,0.5, 0.1)) +
# facet_wrap(.~Reference) +
ylab("Error = |MultiEditR - RNA-seq|") +
xlab("RNA-seq Read Depth") +
annotate(geom = "label", label = figure_1f_correlation_label, x = 2000, y = 2e-3, alpha = 0.7, size = 6) +
theme_bw(base_size = 26) +
theme(aspect.ratio = 1,
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
figure_1f
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 576 rows containing non-finite values (stat_smooth).
Figure 1g
figure_1g = ggplot() +
geom_histogram(data = comparison.df %>% filter(abs(diff) <= 0.001), aes(x = Reads, y = -(..count..)/828), fill="grey30", color = "black") +
geom_histogram(data = comparison.df %>% filter(abs(diff) >= 0.9), aes(x = Reads, y = (..count..)/336), fill="grey", color = "black") +
scale_x_log10(limits = c(1, 1e4), breaks = c(1, 10, 100, 1000, 10000)) +
scale_y_continuous(limits = c(-0.2, 0.2), breaks = seq(-0.2, 0.2, 0.1), labels = c(0.2, 0.1, 0, 0.1, 0.2)) +
geom_label( aes(x=3.5, y=0.19, label="Error ≥ 90%"), size = 6) +
geom_label( aes(x=5, y=-0.19, label="Error ≤ 0.1%"), size = 6) +
theme_bw(26) +
ylab("Relative Density") +
xlab("RNA-seq Read Depth") +
theme(aspect.ratio = 1, panel.grid.major = element_blank(), panel.grid.minor = element_blank())
figure_1g
## Warning: Removed 3 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
Figure 2
Figure 2a
Diagram of three-way comparison workflow
Figure 2b
MultiEditR vs. Amplicon-seq scatter plot
figure_2b = filtered_amplicon_merged_data %>%
plotResults(., var1 = "sanger_perc", var2 = "ngs_perc") %>%
.[[1]] +
ylab("Sanger sequencing (MultiEditR)") +
xlab("RNA-seq (REDitools)")
figure_2b
Figure 2c
MultiEditR vs. Amplicon-seq boxplot
figure_2c = filtered_amplicon_merged_data %>%
plotResults(., var1 = "sanger_perc", var2 = "amplicon_perc") %>%
.[[3]] +
theme(legend.position = "none") +
ylab("Inaccuracy (MultiEditR - Amplicon-seq)")
figure_2c
Figure 2d
REDItools vs. Amplicon-seq scatter plot
figure_2d = filtered_amplicon_merged_data %>%
plotResults(., var1 = "ngs_perc", var2 = "amplicon_perc") %>%
.[[1]] +
ylab("RNA-seq (REDitools)") +
xlab("Amplicon-seq (REDitools)")
figure_2d
Figure 2e
REDItools vs. Amplicon-seq boxplot
figure_2e = filtered_amplicon_merged_data %>%
plotResults(., var1 = "ngs_perc", var2 = "amplicon_perc") %>%
.[[3]] +
theme(legend.position = "none") +
ylab("Inaccuracy (RNA-seq - Amplicon-seq)")
figure_2e
Figure 2f
Accuracy at inclusion criteria
figure_2f = lapply(FUN = analyzeFigure2f, X = c(0.01, 0.05, 0.1), amplicon_merged_data) %>%
plyr::ldply(., "data.frame") %>%
ggplot(aes(x = value, fill = Metric)) +
geom_density(adjust = 2, alpha = 0.5) +
scale_x_continuous(limits = c(-0.5, 0.5), breaks = seq(-0.4, 0.4, 0.1), labels = c("-40%", "", "-20%", "", "0%", "", "20%", "", "40%")) +
geom_vline(xintercept = 0, linetype = "dashed") +
theme_bw(base_size = 18) +
ylab("Density") +
xlab("Inaccuracy (Metric - Amplicon-seq)") +
facet_grid(.~Threshold, scale = "free_y") +
theme_bw(base_size = 16) +
theme(aspect.ratio = 1,
panel.grid = element_blank())
figure_2f
lapply(FUN = analyzeFigure2f, X = c(0.01, 0.05, 0.1), amplicon_merged_data) %>%
plyr::ldply(., "data.frame") %>%
group_by(Metric) %>%
dplyr::summarise(Min = min(value), Max = max(value)) %>%
ungroup() %>%
mutate_if(is.numeric, scales::percent)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Metric Min Max
## <chr> <chr> <chr>
## 1 MultiEditR -22.4% 21.0%
## 2 RNA-seq -26.9% 26.9%
Figure 2g
Confusion matrix
Figure 2h
ROC analysis at 1%, 5%, and 10%
figure_2h_thresholds = c(0.01, 0.05, 0.1)
figure_2h_sanger_data = classification.df %>%
lapply(FUN = analyzeROC,
data = .,
classifier_var = "sanger_pvalue",
label_var = "amplicon_perc", X = figure_2h_thresholds) %>%
plotROC(., figure_2h_thresholds)
figure_2h_rnaseq_data = classification.df %>%
lapply(FUN = analyzeROC,
data = .,
classifier_var = "fisher_pvalue",
label_var = "amplicon_perc", X = figure_2h_thresholds) %>%
plotROC(., figure_2h_thresholds)
figure_2h_ROC_data = bind_rows(figure_2h_sanger_data[[2]], figure_2h_rnaseq_data[[2]]) %>%
mutate(threshold = paste0(threshold, " editing threshold")) %>%
mutate(classifier_var = factor(classifier_var, levels = c("sanger_pvalue", "fisher_pvalue"))) %>%
mutate(threshold = factor(threshold, levels = c("1.0% editing threshold", "5.0% editing threshold", "10.0% editing threshold")))
figure_2h_classification.df = bind_rows(figure_2h_sanger_data[[3]], figure_2h_rnaseq_data[[3]]) %>%
dplyr::rename(threshold = 1) %>%
mutate(threshold = paste0(threshold, " editing threshold")) %>%
mutate(classifier_var = factor(classifier_var, levels = c("sanger_pvalue", "fisher_pvalue"))) %>%
mutate(threshold = factor(threshold, levels = c("1.0% editing threshold", "5.0% editing threshold", "10.0% editing threshold")))
figure_2h = figure_2h_ROC_data %>%
ggplot(aes(x = fpr, y = tpr, color = threshold)) +
geom_line(aes(linetype = classifier_var), size = 1.1, alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
geom_point(data = figure_2h_classification.df, aes(x = fnr, y =Sensitivity), size = 4, pch = 4) +
geom_point(data = figure_2h_classification.df, aes(x = fnr, y =Sensitivity), size = 2) +
labs(color = "Limit of Detection", linetype = "Method") +
scale_color_manual(values = c("#1b9e77", "#d95f02", "#7570b3")) +
scale_linetype_manual(values = c("fisher_pvalue" = 3, "sanger_pvalue" = 1), labels = c("MultiEditR", "REDitools")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0,1, 0.2), labels = scales::percent_format()) +
scale_x_continuous(limits = c(0, 1), breaks = seq(0,1, 0.2), labels = scales::percent_format()) +
annotate(geom = "text", x = 0.5, y = 0.43, label = "Line of random guessing", size = 10, angle = 45) +
annotate(geom = "point", x = 0, y = 1, size = 4, pch = 4) +
annotate(geom = "point", x = 0, y = 1, size = 2) +
theme_bw(base_size = 36) +
ylab("True Positive Rate (Sensitivity)") +
xlab("False Positive Rate (1-Specificity)") +
facet_grid(.~threshold) +
theme(panel.grid = element_blank(), aspect.ratio = 1, strip.background = element_blank(), strip.text = element_text(size = 36))
figure_2h
Figure 2i
Sensitivity, Specificity and AUC
figure_2i_thresholds = c(0.001, 0.005, 0.01, 0.05, 0.10, 0.25)
figure_2i_sanger_data = classification.df %>%
lapply(FUN = analyzeROC,
data = .,
classifier_var = "sanger_pvalue",
label_var = "amplicon_perc", X = figure_2i_thresholds) %>%
plotROC(., figure_2i_thresholds)
figure_2i_rnaseq_data = classification.df %>%
lapply(FUN = analyzeROC,
data = .,
classifier_var = "fisher_pvalue",
label_var = "amplicon_perc", X = figure_2i_thresholds) %>%
plotROC(., figure_2i_thresholds)
figure_2i_classification.df = bind_rows(figure_2i_sanger_data[[3]], figure_2i_rnaseq_data[[3]]) %>%
dplyr::rename(threshold = 1) %>%
mutate(threshold = as.numeric(gsub("%", "", as.character(threshold)))/100) %>%
mutate(classifier_var = factor(classifier_var, levels = rev(c("sanger_pvalue", "fisher_pvalue"))))
figure_2i = figure_2i_classification.df %>%
dplyr::rename(`Matthew's Correlation\nCoefficient (MCC)` = MCC, `Area Under Curve (AUC)` = AUC) %>%
gather(., key = "metric", value = "value", c(Sensitivity:`Matthew's Correlation\nCoefficient (MCC)`)) %>%
mutate(metric = factor(metric, levels = c("Sensitivity", "Specificity", "Area Under Curve (AUC)", "Youden's J", "Matthew's Correlation\nCoefficient (MCC)"))) %>%
filter(metric %in% c("Sensitivity", "Specificity", "Area Under Curve (AUC)")) %>%
ggplot(aes(x = threshold, y = value, fill = classifier_var, color = classifier_var)) +
annotate(geom = "rect", xmin = 0.04, xmax = 0.3, ymin = 0, ymax = 1.05, color = "black", alpha = 0) +
geom_line(stat = "identity", size = 1) +
geom_point(pch = 21, size = 3, color = "black") +
annotate(geom = "text", label = "Best use of\nMultiEditR", x = 0.110, y = 0.1, size = 8) +
scale_x_log10(labels = scales::percent_format(accuracy = 0.1), breaks = figure_2i_thresholds) +
scale_y_continuous(limits = c(0,1.05), breaks = seq(0,1, 0.2)) +
scale_color_manual(values = c("fisher_pvalue" = "#1b9e77", "sanger_pvalue" = "#7570b3"), labels = c("REDitools", "MultiEditR")) +
scale_fill_manual(values = c("fisher_pvalue" = "#1b9e77", "sanger_pvalue" = "#7570b3"), labels = c("REDitools", "MultiEditR")) +
ylab("Value of metric") +
xlab("Editing detection threshold") +
# geom_bar(stat = "identity", position = "dodge") +
labs(color = "Method", fill = "Method") +
theme_bw(base_size = 36) +
facet_grid(.~metric) +
theme(panel.grid = element_blank(),
strip.background = element_blank(),
axis.text.x = element_text(hjust = 1, angle = 45),
aspect.ratio = 1,
strip.text = element_text(size = 36))
figure_2i
Figure 3
Figure 3a
Definition of MEEI
Figure 3b
RNA-seq AEI vs. MEEI
figure_3b_data = comparison.df %>%
dplyr::group_by(sample_file) %>%
dplyr::mutate(ave_reads = mean(Reads), ReferenceBase = which.frequent(Reference)) %>%
ungroup() %>%
filter(Reference == base, Reference == ReferenceBase) %>%
dplyr::select(sample_file, EI_ngs, EI_sanger, Reference, Region, ave_reads) %>%
dplyr::distinct()
figure_3b_cor_data = figure_3b_data %$%
cor.test(x = EI_ngs, y = EI_sanger) %>%
unlist() %>%
.[c(3, 4)] %>%
as.numeric() %>%
signif(3)
figure_3b_correlation_label = paste0("r = ", figure_3b_cor_data[2], ", P = ", figure_3b_cor_data[1])
figure_3b = figure_3b_data %>%
ggplot(aes(y = EI_sanger, x = EI_ngs, fill = ave_reads)) +
geom_point(pch = 21, color = "black", alpha = 0.8, size = 4) +
geom_smooth(method = "lm", color = "black") +
theme_bw(base_size = 18) +
theme(aspect.ratio = 1,
panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
labs(fill = "Average Read Depth") +
ylab("MultiEditR Editing Index (MEI)") +
xlab("RNA-seq Editing Index (EI)\nRoth et al., 2019") +
annotate(geom = "label", label = figure_3b_correlation_label, x = 0.04, y = 0.04, size = 6)
figure_3b
Figure 3c
Amplicon-seq AEI vs. MEEI
figure_3c_data = amplicon_merged_data %>%
dplyr::group_by(sample_file) %>%
dplyr::mutate(ave_reads = mean(Reads), ReferenceBase = which.frequent(Reference)) %>%
ungroup() %>%
filter(Reference == base, Reference == ReferenceBase) %>%
dplyr::select(sample_file, EI_ngs, EI_sanger, Reference, Region, ave_reads) %>%
dplyr::distinct()
figure_3c_cor_data = figure_3c_data %$%
cor.test(x = EI_ngs, y = EI_sanger) %>%
unlist() %>%
.[c(3, 4)] %>%
as.numeric() %>%
signif(3)
figure_3c_correlation_label = paste0("r = ", figure_3c_cor_data[2], ", P = ", figure_3c_cor_data[1])
figure_3c = figure_3c_data %>%
ggplot(aes(y = EI_sanger, x = EI_ngs, fill = ave_reads)) +
geom_point(pch = 21, color = "black", alpha = 0.8, size = 4) +
geom_smooth(method = "lm", color = "black") +
theme_bw(base_size = 18) +
theme(aspect.ratio = 1,
panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
# scale_fill_gradient(low = "#132B43", high = "#41ae76") +
labs(fill = "Average Read Depth") +
ylab("MultiEditR Editing Index (MEI)") +
xlab("Amplicon-seq Editing Index (EI)\nRoth et al., 2019") +
annotate(geom = "label", label = figure_3c_correlation_label, x = 0.04, y = 0.04, size = 6)
figure_3c
Figure 3d
Diagram of 4LN system
Figure 3e
Diagram of experiment
Figure 3f
Flow and MultiEditR data
figure_3f = lambda.df %>%
gather(Metric, Percent, target_edit:gfp) %>%
ggplot(aes(x = enzyme, y = Percent, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
scale_fill_manual(values = rev(c("#4A9BD9", "#41ae76")), labels = c("Flow Cytometry", "MultiEditR")) +
scale_y_continuous(labels = scales::percent_format()) +
ylab("Percent Conversion") +
xlab("") +
labs(fill = "Metric") +
theme_bw(base_size = 24) +
theme(aspect.ratio = 3.5,
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank()) +
facet_grid(.~guide, space = "free_x", scale = "free_x")
figure_3f
Figure 3g
Target Edit to background editing ratio
figure_3g = lambda.df %>%
ggplot(aes(x = enzyme, y = norm_target_MEI_ratio)) +
geom_bar(stat = "identity", position = "dodge", color = "black", fill = c("#1b9e77", "#d95f02", "#7570b3")[3]) +
scale_y_continuous(limits = c(0,30), breaks = c(0,5,10,15,20,25,30)) +
ylab("Target Editing to MEI ratio\n(normalized to CAGX alone) ") +
xlab("") +
theme_bw(base_size = 30) +
geom_hline(yintercept = 1, linetype = "dashed") +
theme(aspect.ratio = 3.5,
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank()) +
facet_grid(.~guide, space = "free_x", scale = "free_x")
figure_3g
Figure 3h
Diagram of T-cell experiments
Figure 3i
NGS data
tcell.df = tcell.df %>%
mutate(NGS = NGS/100, Sanger = Sanger/100, Flow = Flow/100) %>%
group_by(gRNA, Gene, Enzyme) %>%
dplyr::summarize(NGS = mean(NGS), Sanger = mean(Sanger), Flow = mean(Flow))
figure_3i = plotFigure4("Sanger", "NGS", tcell.df, 0.8, 0.8) +
xlab("Sanger sequencing (MultiEditR)") +
ylab("NGS (CRISPR-DAV)")
figure_3i
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
Figure 3k
Protein KO data
figure_3k = plotFigure4("Sanger", "Flow", tcell.df, 0.8, 0.8) +
ylab("Protein Knock-out") +
xlab("Sanger sequencing (MultiEditR)")
figure_3k
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).